# Load the combined retail dataset and integer-encode the categorical id
# columns so they can be used as Keras Embedding indices.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Flatten, Dense, Concatenate, Dropout,
    BatchNormalization, Activation, concatenate, GRU,
)
from tensorflow.keras.optimizers import Adam

df = pd.read_csv('retail_combined_data.csv')

# Map raw customer/product identifiers to contiguous 0..N-1 integers.
# NOTE(review): fit_transform overwrites the raw ids in-place; keep the
# encoders if the original identifiers are needed later (inverse_transform).
customer_encoder = LabelEncoder()
product_encoder = LabelEncoder()
df["customer_id"] = customer_encoder.fit_transform(df["customer_id"])
df["product_id"] = product_encoder.fit_transform(df["product_id"])
# Inspect the encoded dataframe (notebook output preserved below as comments).
df
# |       | customer_id | product_id | purchase_score | name       | description                                    |
# |-------|-------------|------------|----------------|------------|------------------------------------------------|
# | 0     | 269         | 2          | 5              | Headphones | Noise-canceling wireless headphones            |
# | 1     | 53          | 4          | 2              | Smartwatch | Smartwatch with health tracking features       |
# | 2     | 412         | 3          | 4              | Camera     | DSLR camera with 4K video recording            |
# | 3     | 130         | 0          | 5              | Laptop     | High-performance laptop with latest processor  |
# | 4     | 609         | 1          | 4              | Smartphone | Feature-rich smartphone with excellent camera  |
# | ...   | ...         | ...        | ...            | ...        | ...                                            |
# | 19995 | 52          | 3          | 2              | Camera     | DSLR camera with 4K video recording            |
# | 19996 | 788         | 2          | 4              | Headphones | Noise-canceling wireless headphones            |
# | 19997 | 396         | 1          | 2              | Smartphone | Feature-rich smartphone with excellent camera  |
# | 19998 | 193         | 3          | 3              | Camera     | DSLR camera with 4K video recording            |
# | 19999 | 398         | 3          | 1              | Camera     | DSLR camera with 4K video recording            |
# 20000 rows x 5 columns

# Five distinct (encoded) products in the catalogue.
df['product_id'].unique()
# Output: array([2, 4, 3, 0, 1], dtype=int64)
# ---------------------------------------------------------------------------
# Collaborative-filtering branch: learn customer and product embeddings and
# regress the purchase score from their concatenation.
# ---------------------------------------------------------------------------
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# +1 so the largest encoded id is always a valid embedding row.
# (Ids are already contiguous 0..n-1 from LabelEncoder, so nunique() alone
# would suffice; the extra row is harmless.)
num_customers = df["customer_id"].nunique() + 1
num_products = df["product_id"].nunique() + 1
embedding_dim = 10

# Input layers: one scalar id per example.
customer_input = Input(shape=(1,), name="customer_input")
product_input = Input(shape=(1,), name="product_input")

# Embeddings
customer_embedding = Embedding(num_customers, embedding_dim,
                               name="customer_embedding")(customer_input)
product_embedding = Embedding(num_products, embedding_dim,
                              name="product_embedding")(product_input)
customer_vec = Flatten()(customer_embedding)
product_vec = Flatten()(product_embedding)

# Merge the two id vectors and regress the score with a small MLP.
collab_layer = Concatenate()([customer_vec, product_vec])
dense_layer = Dense(128, activation='relu')(collab_layer)
dense_layer = Dense(64, activation='relu')(dense_layer)
collab_output = Dense(1, activation='linear', name="collab_output")(dense_layer)

collab_model = Model(inputs=[customer_input, product_input],
                     outputs=collab_output)
collab_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

collab_model.fit(
    [train_data['customer_id'], train_data['product_id']],
    train_data['purchase_score'],
    validation_data=([test_data['customer_id'], test_data['product_id']],
                     test_data['purchase_score']),
    epochs=1, batch_size=200,
)
# Output:
# 80/80 - 5s 13ms/step - loss: 7.4246 - mae: 2.3178 - val_loss: 2.0279 - val_mae: 1.2307
# <keras.src.callbacks.history.History at 0x2154008d110>
# ---------------------------------------------------------------------------
# Text preprocessing: fit one shared tokenizer over both the name and the
# description text, then convert each column to padded integer sequences.
# ---------------------------------------------------------------------------
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# One vocabulary for both text columns so token ids are comparable.
raw_text = np.hstack([df.name.str.lower(), df.description.str.lower()])
tokenizer = Tokenizer()
tokenizer.fit_on_texts(raw_text)

description_token = tokenizer.texts_to_sequences(df.description.str.lower())
name_token = tokenizer.texts_to_sequences(df.name.str.lower())

# Longest observed sequence per column. NOTE(review): the pad_sequences calls
# below hard-code maxlen=5/2 rather than using these values — presumably they
# match on this dataset, but confirm if the text ever changes.
max_description_token_length = pd.Series(description_token).map(len).max()
max_name_token_length = pd.Series(name_token).map(len).max()

# Vocabulary size = largest token id + 1 (Embedding input_dim must exceed
# every index it is fed).
vocab_size = np.max(np.concatenate([np.concatenate(description_token),
                                    np.concatenate(name_token)])) + 1

desc_padded = pad_sequences(description_token, maxlen=5)
name_padded = pad_sequences(name_token, maxlen=2)
# ---------------------------------------------------------------------------
# Content-based branch: GRU encoders over the padded name and description
# token sequences, regressing the same purchase score.
# ---------------------------------------------------------------------------
from tensorflow.keras.layers import (
    Input, Dropout, Dense, BatchNormalization, Activation, concatenate,
    GRU, Embedding, Flatten,
)

# Shapes match the pad_sequences maxlen values (5 and 2) used above.
desc_input = Input(shape=(5,), name="desc_input")
name_input = Input(shape=(2,), name="name_input")

desc_embedding = Embedding(input_dim=vocab_size, output_dim=5)(desc_input)
name_embedding = Embedding(input_dim=vocab_size, output_dim=5)(name_input)
desc_rnn = GRU(16)(desc_embedding)
name_rnn = GRU(8)(name_embedding)

merged = Concatenate()([desc_rnn, name_rnn])
dense_layer = Dense(64, activation='relu')(merged)
dense_layer = Dense(64, activation='relu')(dense_layer)
content_output = Dense(1, activation='linear', name="content_output")(dense_layer)

# Split the padded text with the same test_size/random_state as the df split
# so the rows stay aligned with train_data/test_data.
name_train, name_test, desc_train, desc_test = train_test_split(
    name_padded, desc_padded, test_size=0.2, random_state=42
)
print(name_train.shape)                     # (16000, 2)
print(desc_train.shape)                     # (16000, 5)
print(train_data['purchase_score'].shape)   # (16000,)

# Content-based model
content_model = Model(inputs=[name_input, desc_input], outputs=content_output)
content_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

# Convert at the call site instead of assigning back into the train_data
# slice, which raises pandas' SettingWithCopyWarning.
content_model.fit(
    [name_train, desc_train], train_data['purchase_score'].to_numpy(),
    validation_data=([name_test, desc_test], test_data['purchase_score']),
    epochs=1, batch_size=2,
)
# Output:
# 8000/8000 - 63s 7ms/step - loss: 2.2415 - mae: 1.2799 - val_loss: 2.0064 - val_mae: 1.2101
# <keras.src.callbacks.history.History at 0x21543fbdc90>
# ---------------------------------------------------------------------------
# Hybrid model: concatenate the outputs of both branches and learn a final
# blend. NOTE(review): this reuses the already-trained branch graphs, so
# their weights continue to train during this fit as well.
# ---------------------------------------------------------------------------
merged_layer = Concatenate()([collab_output, content_output])
hybrid_dense = Dense(64, activation='relu')(merged_layer)
hybrid_output = Dense(1, activation='linear', name="hybrid_output")(hybrid_dense)

hybrid_model = Model(
    inputs=[customer_input, product_input, name_input, desc_input],
    outputs=hybrid_output,
)
hybrid_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

hybrid_model.fit(
    [train_data['customer_id'], train_data['product_id'], name_train, desc_train],
    train_data['purchase_score'],
    validation_data=(
        [test_data['customer_id'], test_data['product_id'], name_test, desc_test],
        test_data['purchase_score'],
    ),
    epochs=2, batch_size=20,
)
# Output:
# Epoch 1/2
# 800/800 - 25s 12ms/step - loss: 2.4399 - mae: 1.3189 - val_loss: 2.0067 - val_mae: 1.2160
# Epoch 2/2
# 800/800 - 8s 8ms/step - loss: 2.0284 - mae: 1.2332 - val_loss: 2.0411 - val_mae: 1.2343
# <keras.src.callbacks.history.History at 0x2155275e590>
# Membership tables of ids seen during training. The ids themselves are the
# embedding indices (LabelEncoder output), so the enumeration positions in
# these maps must NOT be fed to the models.
customer_map = {customer_id: idx for idx, customer_id in enumerate(train_data['customer_id'].unique())}
product_map = {product_id: idx for idx, product_id in enumerate(train_data['product_id'].unique())}


def recommend_products(customer_id, top_n=3):
    """Return the top_n product ids with the highest hybrid score for customer_id.

    The score is an equal-weight blend of the collaborative prediction
    (customer/product embeddings) and the content prediction (name and
    description text). Returns the string "Customer not found." when the
    customer was not seen during training.
    """
    if customer_id not in customer_map:
        return "Customer not found."

    # Candidate set: every product seen during training, as encoded ids
    # (valid embedding indices). The original notebook predicted over an
    # undefined `all_products` (NameError) and also scored content over all
    # 20000 rows while the collaborative scores had a different length.
    candidate_products = np.array(sorted(product_map.keys()))

    # One representative padded name/description row per candidate product
    # (all rows of a given product carry the same text in this dataset).
    row_for_product = {pid: i for i, pid in enumerate(df['product_id'])}
    rep_rows = np.array([row_for_product[pid] for pid in candidate_products])
    filtered_names = name_padded[rep_rows]
    filtered_descs = desc_padded[rep_rows]

    # Branch predictions, aligned 1:1 with candidate_products. The encoded
    # customer_id is passed directly — it is the index the customer embedding
    # was trained on.
    collab_scores = collab_model.predict(
        [np.full(len(candidate_products), customer_id), candidate_products]
    ).flatten()
    content_scores = content_model.predict(
        [filtered_names, filtered_descs]
    ).flatten()

    # Hybrid score: weighted sum of collaborative & content scores.
    hybrid_scores = 0.5 * collab_scores + 0.5 * content_scores

    # Top-N product ids by descending hybrid score.
    top_indices = np.argsort(-hybrid_scores)[:top_n]
    return [int(candidate_products[i]) for i in top_indices]


# Get recommendations
print(recommend_products(270, top_n=2))
# Output (original run): [2, 4]